Acknowledgement: this report has been inspired by materials used in the Advanced Bioinformatics course taught by Igor Ruiz de los Mozos at UPNA.

1 Data preparation

1.1 Load dataset

After completing the nf-core/rnaseq pipeline using Salmon pseudoalignment for the analysis of 44 colorectal samples from SRA study SRP479528 will obtain the salmon.merged.gene.SummarizedExperiment.rds file. This file contains gene-level quantification matrices (counts, TPMs, effective lengths) plus sample metadata and annotations produced from the merged Salmon quantifications.

Let’s load the salmon.merged.gene.SummarizedExperiment.rds file. The content of the file has a SummarizedExperiment representation, which is an R/Bioconductor container that stores assay matrices (e.g., counts), feature metadata, and sample metadata in one coherent object.

# Specify a working directory
directory <- "/Volumes/DataWinBackup/00_MASTER UOC BIOINFORMATICA - TEMPORAL/TFM-UOC"
setwd(directory)
# list.files()

# Read 'se' object obtained from nfcore/rnaseq
se <- readRDS("salmon.merged.gene.SummarizedExperiment.rds")

# Display se object
print(se)
class: SummarizedExperiment 
dim: 78428 44 
metadata(0):
assays(5): counts counts_length_scaled counts_scaled lengths tpm
rownames(78428): ENSG00000000003 ENSG00000000005 ... ENSG00000310592
  ENSG00000310593
rowData names(3): transcript_id gene_id gene_name
colnames(44): srr27320655_rectum_control srr27320656_rectum_stage1 ...
  srr27320697_sigmoid_control srr27320698_sigmoid_stage2
colData names(4): sample fastq_1 fastq_2 strandedness

We focus on the counts assay matrix and check information on columns and rows.

# Check the first elements of the counts matrix
head(assay(se, "counts")[, 1:2], 5)
                srr27320655_rectum_control srr27320656_rectum_stage1
ENSG00000000003                   1397.946                  6337.771
ENSG00000000005                      6.000                    79.000
ENSG00000000419                    633.000                  4414.000
ENSG00000000457                    434.061                   740.999
ENSG00000000460                     88.000                   662.000
# Collect sample information
head(colData(se)[, 2:3], 5)
DataFrame with 5 rows and 2 columns
                                                  fastq_1
                                              <character>
srr27320655_rectum_control         data/reads/SRR273206..
srr27320656_rectum_stage1          data/reads/SRR273206..
srr27320657_hepaticflexure_control data/reads/SRR273206..
srr27320658_hepaticflexure_stage2  data/reads/SRR273206..
srr27320659_sigmoid_control        data/reads/SRR273206..
                                                  fastq_2
                                              <character>
srr27320655_rectum_control         data/reads/SRR273206..
srr27320656_rectum_stage1          data/reads/SRR273206..
srr27320657_hepaticflexure_control data/reads/SRR273206..
srr27320658_hepaticflexure_stage2  data/reads/SRR273206..
srr27320659_sigmoid_control        data/reads/SRR273206..
# Collect gene information
head(rowData(se), 3)
DataFrame with 3 rows and 3 columns
                         transcript_id         gene_id   gene_name
                           <character>     <character> <character>
ENSG00000000003 ENST00000373020,ENST.. ENSG00000000003      TSPAN6
ENSG00000000005 ENST00000373031,ENST.. ENSG00000000005        TNMD
ENSG00000000419 ENST00000466152,ENST.. ENSG00000000419        DPM1
# Collect genomic ranges
head(rowRanges(se), 3)
NULL

Below, we can obtain the name of the 44 samples.

# Display colNames
colData(se)$sample
 [1] "srr27320655_rectum_control"          "srr27320656_rectum_stage1"          
 [3] "srr27320657_hepaticflexure_control"  "srr27320658_hepaticflexure_stage2"  
 [5] "srr27320659_sigmoid_control"         "srr27320660_sigmoid_stage4"         
 [7] "srr27320661_cecum_control"           "srr27320662_cecum_stage1"           
 [9] "srr27320663_sigmoid_control"         "srr27320664_sigmoid_stage2"         
[11] "srr27320665_sigmoid_control"         "srr27320666_sigmoid_stage2"         
[13] "srr27320667_rectum_control"          "srr27320668_rectum_stage1"          
[15] "srr27320669_sigmoid_control"         "srr27320670_sigmoid_stage4"         
[17] "srr27320671_rectum_control"          "srr27320672_rectum_stage1"          
[19] "srr27320673_rectum_control"          "srr27320674_rectum_stage1"          
[21] "srr27320675_cecum_control"           "srr27320676_cecum_stage3"           
[23] "srr27320677_rectosigmoid_control"    "srr27320678_rectosigmoid_stage3"    
[25] "srr27320679_rectum_control"          "srr27320680_rectum_stage1"          
[27] "srr27320681_sigmoid_control"         "srr27320682_sigmoid_stage1"         
[29] "srr27320683_sigmoid_control"         "srr27320684_sigmoid_stage2"         
[31] "srr27320685_rectum_control"          "srr27320686_rectum_stage4"          
[33] "srr27320687_rectum_control"          "srr27320688_rectum_stage2"          
[35] "srr27320689_sigmoid_control"         "srr27320690_sigmoid_stage1"         
[37] "srr27320691_descendingcolon_control" "srr27320692_descendingcolon_stage2" 
[39] "srr27320693_rectum_control"          "srr27320694_rectum_stage3"          
[41] "srr27320695_cecum_control"           "srr27320696_cecum_stage3"           
[43] "srr27320697_sigmoid_control"         "srr27320698_sigmoid_stage2"         

Since the colData field in the se object is limited to data from the original FASTQ files, it would be useful to supplement it with metadata from the SRA study, such as patient age and tissue type.

1.2 Preparation of .csv files

In this section we describe the steps to add metadata from the SRA study into the colData field of the SummarizedExperiment object obtained from the nf-core/rnaseq pipeline:

  • Go to SRA study SRP479528
  • Download metadata of 44 runs
  • Open the downloaded SraRunTable.csv file in Google Sheets
  • Convert the spreadsheet into a table
  • Order data by run number SRR27320655, SRR27320656, SRR27320657…
  • Download it as .csv file and rename it sraruntable_srp479528_crc44.csv

Here are the first lines of the sraruntable_srp479528_crc44.csv file.

Run,age_at_surgery,tissue,tumor_stage,Sample Name,SRA Study,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Collection_Date,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,geo_loc_name_country,geo_loc_name_country_continent,geo_loc_name,Instrument,Library Name,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,create_date,version,Column 1,source_name
SRR27320655,88.75,Rectum,NA,GSM7989032,SRP479528,RNA-Seq,117,2304341839,PRJNA1055547,SAMN39058850,870996594,"BIOCHEMISTRY, COLORECTAL SURGERY, PENN STATE COLLEGE OF MEDICINE",missing,public,"fastq,run.zq,sra","ncbi,gs,s3","s3.us-east-1,ncbi.public,gs.us-east1",SRX22997924,uncalculated,uncalculated,missing,Illumina NovaSeq 6000,GSM7989032,PAIRED,cDNA,TRANSCRIPTOMIC,Homo sapiens,ILLUMINA,2024-04-11T00:00:00Z,2023-12-21T21:33:00Z,1,,Rectum

As follows, a metadata .csv file was created using information from BioProject PRJNA1055547. Here are the first lines of the created sraruntable_srp479528_crc44_metadata.csv file.

key,values
abstract,"The incidence of colorectal cancer (CRC) has been steadily increasing in younger individuals over the past several decades for reasons that are incompletely defined..."
Accession,PRJNA1055547; GEO: GSE251845
Data Type,Transcriptome or Gene expression

1.3 Update SummarizedExperiment object

Let’s add metadata contained in the previously created sraruntable_srp479528_crc44.csv and sraruntable_srp479528_crc44_metadata.csv files into the se object.

After loading the .csv files, here we can see the samples metadata represented as a dataframe named df_coldata:

# Load files
df_coldata <- read.csv("sraruntable_srp479528_crc44.csv", header = TRUE, row.names = 1,
    check.names = FALSE)

# Display first rows
head(df_coldata[, 1:5], n = 3)
            age_at_surgery          tissue tumor_stage Sample Name SRA Study
SRR27320655          88.75          Rectum          NA  GSM7989032 SRP479528
SRR27320656          88.75          Rectum           1  GSM7989031 SRP479528
SRR27320657          68.00 Hepatic flexure          NA  GSM7989030 SRP479528

and below is the metadata corresponding to the SRA study represented as a dataframe named df_metadata with two columns: key and values.

# Load files
df_metadata <- read.csv("sraruntable_srp479528_crc44_metadata.csv", header = TRUE,
    check.names = FALSE)

# Display first rows
head(df_metadata, n = 3)
        key
1  abstract
2 Accession
3 Data Type
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      values
1 The incidence of colorectal cancer (CRC) has been steadily increasing in younger individuals over the past several decades for reasons that are incompletely defined. Identifying differences in gene expression profiles, or transcriptomes, in early-onset colorectal cancer (EOCRC, < 50 years old) patients versus later-onset colorectal cancer (LOCRC, > 50 years old) patients is one approach to understanding molecular and genetic features that distinguish EOCRC. Methods: We performed RNA-sequencing (RNA-seq) to characterize the transcriptomes of patient-matched tumors and adjacent, uninvolved (normal) colonic segments from EOCRC (n=21) and LOCRC (n=22) patients. The EOCRC and LOCRC cohorts were matched for demographic and clinical characteristics. We used The Cancer Genome Atlas Colon Adenocarcinoma (TCGA-COAD) database for validation. We used a series of computational and bioinformatic tools to identify EOCRC-specific differentially expressed genes, molecular pathways, predicted cell populations, differential gene splicing events, and predicted neoantigens. Results: We identified an eight-gene signature in EOCRC comprised of ALDOB, FBXL16, IL1RN, MSLN, RAC3, SLC38A11, WBSCR27 and WNT11, from which we developed a score predictive of overall CRC patient survival. On the entire set of genes identified in normal tissues and tumors, cell type deconvolution analysis predicted a differential abundance of immune and non-immune populations in EOCRC versus LOCRC. Gene set enrichment analysis identified increased expression of splicing machinery in EOCRC. We further found differences in alternative splicing (AS) events, including one within the long non-coding RNA, HOTAIRM1. Additional analysis of AS found seven events specific to EOCRC that encode potential neoantigens. Conclusion: Our transcriptome analyses identified genetic and molecular features specific to EOCRC which may inform future screening, development of prognostic indicators, and novel drug targets. Overall design: Gene expression profiles of 22 surgically resected tumors and patient-matched adjacent colonic segments from colorectal cancer patients were generated with RNA-sequencing.
2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               PRJNA1055547; GEO: GSE251845
3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Transcriptome or Gene expression

The lower_underscore_nospaces() function helped us standardize the format of character-type data.

# Define the lower_underscore_nospaces() function
lower_underscore_nospaces <- function(df) {
    non_numeric_col <- names(df)[!sapply(df, is.numeric)]
    for (col in non_numeric_col) {
        df[[col]] <- tolower(df[[col]])  # lowercase
        df[[col]] <- gsub("[^a-z0-9]+", "_", df[[col]])  # replace non-alphanumeric with underscores
        df[[col]] <- gsub("_$", "", df[[col]])  # remove trailing underscores
    }
    if (!is.null(names(df))) {
        names(df) <- tolower(names(df))
        names(df) <- gsub("[^a-z0-9]+", "_", names(df))
        names(df) <- gsub("_$", "", names(df))
    }
    if (!is.null(rownames(df))) {
        rownames(df) <- tolower(rownames(df))
        rownames(df) <- gsub("[^a-z0-9]+", "_", rownames(df))
        rownames(df) <- gsub("_$", "", rownames(df))
    }
    return(df)
}

# Apply lower_underscore_nospaces to all datasets
df_coldata <- lower_underscore_nospaces(df_coldata)

# Display last changes on dataframes
head(df_coldata[, 1:5], n = 3)
            age_at_surgery          tissue tumor_stage sample_name sra_study
srr27320655          88.75          rectum          NA  gsm7989032 srp479528
srr27320656          88.75          rectum           1  gsm7989031 srp479528
srr27320657          68.00 hepatic_flexure          NA  gsm7989030 srp479528

Since the metadata(se) field in a SummarizedExperiment object stores data as a named list, let’s convert df_metadata to this format.

# Convert metadata dataframe into a named list
list_metadata <- as.list(df_metadata$values)
names(list_metadata) <- df_metadata$key
list_metadata
$abstract
[1] "The incidence of colorectal cancer (CRC) has been steadily increasing in younger individuals over the past several decades for reasons that are incompletely defined. Identifying differences in gene expression profiles, or transcriptomes, in early-onset colorectal cancer (EOCRC, < 50 years old) patients versus later-onset colorectal cancer (LOCRC, > 50 years old) patients is one approach to understanding molecular and genetic features that distinguish EOCRC. Methods: We performed RNA-sequencing (RNA-seq) to characterize the transcriptomes of patient-matched tumors and adjacent, uninvolved (normal) colonic segments from EOCRC (n=21) and LOCRC (n=22) patients. The EOCRC and LOCRC cohorts were matched for demographic and clinical characteristics. We used The Cancer Genome Atlas Colon Adenocarcinoma (TCGA-COAD) database for validation. We used a series of computational and bioinformatic tools to identify EOCRC-specific differentially expressed genes, molecular pathways, predicted cell populations, differential gene splicing events, and predicted neoantigens. Results: We identified an eight-gene signature in EOCRC comprised of ALDOB, FBXL16, IL1RN, MSLN, RAC3, SLC38A11, WBSCR27 and WNT11, from which we developed a score predictive of overall CRC patient survival. On the entire set of genes identified in normal tissues and tumors, cell type deconvolution analysis predicted a differential abundance of immune and non-immune populations in EOCRC versus LOCRC. Gene set enrichment analysis identified increased expression of splicing machinery in EOCRC. We further found differences in alternative splicing (AS) events, including one within the long non-coding RNA, HOTAIRM1. Additional analysis of AS found seven events specific to EOCRC that encode potential neoantigens. Conclusion: Our transcriptome analyses identified genetic and molecular features specific to EOCRC which may inform future screening, development of prognostic indicators, and novel drug targets. Overall design: Gene expression profiles of 22 surgically resected tumors and patient-matched adjacent colonic segments from colorectal cancer patients were generated with RNA-sequencing."

$Accession
[1] "PRJNA1055547; GEO: GSE251845"

$`Data Type`
[1] "Transcriptome or Gene expression"

$Scope
[1] "Multiisolate"

$Organism
[1] "Homo sapiens[Taxonomy ID: 9606]Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Homo; Homo sapiens"

$Publications
[1] "Marx OM et al., \"Identification of differentially expressed genes and splicing events in early-onset colorectal cancer.\", Front Oncol, 2024;14:1365762"

$Grants
[1] "\"Wnt/beta-catenin signaling in early-onset colorectal cancer\" (Grant ID R03 CA279861, National Cancer Institute)"

$Submission
[1] "Registration date: 21-Dec-2023\nBiochemistry, Colorectal Surgery, Penn State College of Medicine"

$Relevance
[1] "Medical"

Now let’s convert R regular dataframes into Bioconductor S4 DataFrame objects.

# Convert regular R dataframes to Bioconductor S4 DataFrame objects
colData <- DataFrame(df_coldata)

# Visualize data converted to S4 DataFrame S4 type
print(colData)
DataFrame with 44 rows and 32 columns
            age_at_surgery          tissue tumor_stage sample_name   sra_study
                 <numeric>     <character>   <integer> <character> <character>
             assay_type avgspotlen      bases   bioproject    biosample
            <character>  <integer>  <numeric>  <character>  <character>
                 bytes            center_name collection_date     consent
             <integer>            <character>     <character> <character>
            datastore_filetype datastore_provider       datastore_region
                   <character>        <character>            <character>
             experiment geo_loc_name_country geo_loc_name_country_continent
            <character>          <character>                    <character>
            geo_loc_name            instrument library_name librarylayout
             <character>           <character>  <character>   <character>
            libraryselection  librarysource     organism    platform
                 <character>    <character>  <character> <character>
                     releasedate          create_date   version     source_name
                     <character>          <character> <integer>     <character>
 [ reached 'max' / getOption("max.print") -- omitted 11 rows ]

and verify that their dimensions match.

# Check dimmensions of each dataset
cat("Dimensions of counts assay:", dim(assay(se, "counts")), "\n", "Dimensions of colData:",
    dim(colData), "\n", "Dimensions of metadata:", length(list_metadata), "\n")
Dimensions of counts assay: 78428 44 
 Dimensions of colData: 44 32 
 Dimensions of metadata: 9 

Finally, let’s update the SummarizedExperiment object to include information on colData and metadata fields.

# Update SE object
colData(se) <- colData

# Add metadata
metadata(se) <- list_metadata
se
class: SummarizedExperiment 
dim: 78428 44 
metadata(9): abstract Accession ... Submission Relevance
assays(5): counts counts_length_scaled counts_scaled lengths tpm
rownames(78428): ENSG00000000003 ENSG00000000005 ... ENSG00000310592
  ENSG00000310593
rowData names(3): transcript_id gene_id gene_name
colnames(44): srr27320655 srr27320656 ... srr27320697 srr27320698
colData names(32): age_at_surgery tissue ... version source_name

We’re on the right track. To move on, let’s include a column named condition, which will convert the tumor_stage column of colData into a binary factor variable with levels normal and tumor. A good practice in R is that the first level be the reference one (e.g. normal, control, or untreated samples). This will come handy when conducting the differential expression analysis with the volcano plots.

# Create new colData column named condition
colData(se)$condition <- ifelse(is.na(colData(se)$tumor_stage), "normal", "tumor")
colData(se)$condition <- as.factor(colData(se)$condition)
colData(se)$condition
 [1] normal tumor  normal tumor  normal tumor  normal tumor  normal tumor 
[11] normal tumor  normal tumor  normal tumor  normal tumor  normal tumor 
[21] normal tumor  normal tumor  normal tumor  normal tumor  normal tumor 
[31] normal tumor  normal tumor  normal tumor  normal tumor  normal tumor 
[41] normal tumor  normal tumor 
Levels: normal tumor

We can now save the se object for reuse, eliminating the need to rerun this section.

# Save the updated `se` object to a new file
saveRDS(se, file = "gene.SummarizedExperiment.metadata.rds")

1.4 Create a DESeqDataSet data object

A DESeqDataSet (dds) object encapsulates all components needed for DE analysis. It ensures statistical correctness and handles data integrity by combining:

  • Count data (from assays)
  • Sample metadata (from colData)
  • Experimental design (formula)
  • Gene metadata (from rowData)

The first step to convert our se into DESeqDataSet is to ensure that count data from the assay be a matrix data type. Currently, assay(se) is dataframe class containing lists.

# Check class and mode of each assay in se
sapply(assays(se), class)
              counts counts_length_scaled        counts_scaled 
        "data.frame"         "data.frame"         "data.frame" 
             lengths                  tpm 
        "data.frame"         "data.frame" 
sapply(assays(se), mode)
              counts counts_length_scaled        counts_scaled 
              "list"               "list"               "list" 
             lengths                  tpm 
              "list"               "list" 

Let’s convert assay(se, "counts") into a numeric matrix and round count data.

# Extract the counts assay as a matrix
counts_mat <- as.matrix(assay(se, "counts"))
assay(se, "counts") <- round(counts_mat)

# Keep only the 'counts' assay
assays(se) <- assays(se)["counts"]

# Check class and mode of each assay in se
sapply(assays(se), class)
     counts  
[1,] "matrix"
[2,] "array" 
sapply(assays(se), mode)
   counts 
"numeric" 

Now, we’ll remove genes with less than 10 reads on all samples. This practice will save computational resources when conducting the DE analysis. It is preferable to run this task before converting from se to a DESeqDataSet due to memory constraints.

gene_count_10plus <- rowSums(assay(se)) > 10
se_filtered <- se[gene_count_10plus, ]
se_filtered
class: SummarizedExperiment 
dim: 37239 44 
metadata(9): abstract Accession ... Submission Relevance
assays(1): counts
rownames(37239): ENSG00000000003 ENSG00000000005 ... ENSG00000310592
  ENSG00000310593
rowData names(3): transcript_id gene_id gene_name
colnames(44): srr27320655 srr27320656 ... srr27320697 srr27320698
colData names(33): age_at_surgery tissue ... source_name condition

Finally, we construct DESeqDataSet considering condition (normal vs. tumor) as part of our experimental design.

# Create a DESeqDataSet data object
deseq2_raw <- DESeqDataSet(se_filtered, design = ~condition)

# Display first lines of DESeqDataSet data object
class(deseq2_raw)
[1] "DESeqDataSet"
attr(,"package")
[1] "DESeq2"
head(counts(deseq2_raw)[, 1:5], n = 3)
                srr27320655 srr27320656 srr27320657 srr27320658 srr27320659
ENSG00000000003        1398        6338        1824        1224         944
ENSG00000000005           6          79          38           3         299
ENSG00000000419         633        4414         916        1017         637

In essence, we have the same data but now it is a DESeqDataSet data object.

2 Exploratory Data Analyisis (EDA)

In this section, we explore in more detail the content of the DESeqDataSet data object.

# Display DESeqDataSet data object
deseq2_raw
class: DESeqDataSet 
dim: 37239 44 
metadata(10): abstract Accession ... Relevance version
assays(1): counts
rownames(37239): ENSG00000000003 ENSG00000000005 ... ENSG00000310592
  ENSG00000310593
rowData names(3): transcript_id gene_id gene_name
colnames(44): srr27320655 srr27320656 ... srr27320697 srr27320698
colData names(33): age_at_surgery tissue ... source_name condition
# Show first lines
head(counts(deseq2_raw)[, 1:5], n = 3)
                srr27320655 srr27320656 srr27320657 srr27320658 srr27320659
ENSG00000000003        1398        6338        1824        1224         944
ENSG00000000005           6          79          38           3         299
ENSG00000000419         633        4414         916        1017         637
# Validate NAs
X <- counts(deseq2_raw)
X <- as.data.frame(X)  # convert to data.frame 
print(table(is.na(X)))

  FALSE 
1638516 

The dataset comprises 37,239 genes and 44 samples, for a total of 1,638,516 read counts, and contains no missing values (NAs).

2.1 Normalization by sequencing depth

To compare raw counts across 44 samples in a DESeq2 analysis, we need to account for library size differences. This is where normalization by sequencing depth comes handy to correct for differences in total read counts per sample. This is the baseline normalization for differential expression testing.

Let’s calculate the size factor per sample.

# Display raw counts
head(counts(deseq2_raw)[, 1:5], 5)
                srr27320655 srr27320656 srr27320657 srr27320658 srr27320659
ENSG00000000003        1398        6338        1824        1224         944
ENSG00000000005           6          79          38           3         299
ENSG00000000419         633        4414         916        1017         637
ENSG00000000457         434         741         604         553         274
ENSG00000000460          88         662         108         381          47
# Sum all counts per sample counts(deseq2_raw) is equivalent to assay(se,
# 'counts') from SummarizedExperiment print(colSums(counts(deseq2_raw))[1:5])

# Calculate the size factor and add it to the dataset It normalizes for
# sequencing depth differences to compare expression between samples fairly
# DESeq2 will automatically account for these depth differences in all
# downstream analyses using these size factors.
deseq2_raw <- estimateSizeFactors(deseq2_raw)
print(sizeFactors(deseq2_raw)[1:5])
srr27320655 srr27320656 srr27320657 srr27320658 srr27320659 
  0.6272915   1.2203391   1.1606778   1.0376846   0.7746979 

Remember that,

  • Size factor = 1.0: Average sequencing depth
  • Size factor < 1.0: Lower than average sequencing depth
  • Size factor > 1.0: Higher than average sequencing depth

The sizeFactor column was added to the coldData field of the DESeqDataSet data object.

# Check that the sizeFactor column was added to colData
deseq2_raw
class: DESeqDataSet 
dim: 37239 44 
metadata(10): abstract Accession ... Relevance version
assays(1): counts
rownames(37239): ENSG00000000003 ENSG00000000005 ... ENSG00000310592
  ENSG00000310593
rowData names(3): transcript_id gene_id gene_name
colnames(44): srr27320655 srr27320656 ... srr27320697 srr27320698
colData names(34): age_at_surgery tissue ... condition sizeFactor

Now, all raw counts across 44 samples have accounted for library size differences.

# Now take a look at the sum of the total depth after normalization
# print(colSums(counts(deseq2_raw, normalized=TRUE))[1:5])

# To retrieve normalized read counts
counts_normalized <- counts(deseq2_raw, normalized = TRUE)

# Display normalized counts
head(counts_normalized[, 1:5], 5)
                srr27320655 srr27320656 srr27320657 srr27320658 srr27320659
ENSG00000000003 2228.629003  5193.63847  1571.49553 1179.549163  1218.53955
ENSG00000000005    9.564931    64.73611    32.73949    2.891052   385.95691
ENSG00000000419 1009.100257  3617.02749   789.19403  980.066584   822.25603
ENSG00000000457  691.863367   607.20828   520.38558  532.917228   353.68627
ENSG00000000460  140.285660   542.47218    93.04908  367.163588    60.66881

2.2 By Log2 transformation

# Log2 normalization adding 1 pseudocount
counts_log_normalized <- log2(counts_normalized + 1)

# Create a plot region of 1x2 par(mfrow=c(1,2))

# Boxplot of non-transformed read counts per sample
boxplot(counts_normalized, notch = TRUE, las = 2, cex.axis = 0.7, col = "lightblue",
    main = "non-transformed  read  counts", ylab = "read counts")

# Boxplot of log2-transformed read counts
boxplot(counts_log_normalized, notch = TRUE, las = 2, cex.axis = 0.7, col = "lightblue",
    main = "log2-transformed read counts", ylab = "log2(read counts)")

2.3 Data Transformation

For RNA-seq counts, the expected variance grows with the mean, making low-count genes contribute excessive noise or making high-count genes dominate. Therefore, it is required to implement variance stabilization so that each gene contributes equally to overall variance patterns making sample relationships and biological signals easier to detect.

# If needed, install the vsn package
BiocManager::install("vsn")  # Safe to run even if already installed
library(vsn)

# If needed, install the ggplot2 package
BiocManager::install("ggplot2", force = TRUE)

The downloaded binary packages are in
    /var/folders/9f/1hd8_p6j2zqf22s70l1y0tbh0000gn/T//RtmpGg3gIr/downloaded_packages
library(ggplot2)

# BiocManager::install('hexbin')
library("hexbin")

DESeq2 offers two transformations to stabilize variance and thus transform data to become approximately homoskedastic (to see a flat trend in the meanSdPlot): the variance stabilizing transformation (VST) for negative binomial data with a dispersion-mean trend (Anders and Huber 2010), and the regularized-logarithm transformation or rlog (Love, Huber, and Anders 2014).

2.3.1 Variance Stabilizing Transformation (VST) and Regularized-logarithm Transformation (rlog)

“VST is much faster to compute and is less sensitive to high count outliers than the rlog. Also, VST is recommended for medium-to-large datasets (n > 30). rlog tends to work well on small datasets (n < 30), potentially outperforming VST when there is a wide range of sequencing depth across samples (an order of magnitude difference.)”

# Implement VST
counts_vst <- vst(deseq2_raw, blind = TRUE)
# counts_vst is a DESeq2 object type head(assay(counts_vst), 3)

# Implement rlog
counts_rlog <- rlog(deseq2_raw, blind = TRUE)
# counts_rlog is a DESeq2 object type head(assay(counts_rlog), 3)
# Load gridExtra to arrange ggplots into a single figure
library(gridExtra)

# Plot the mean vs variance of genes (37,239) across all samples (44)
# non-transformed read counts
p1 <- meanSdPlot(counts_normalized, ranks = FALSE, plot = FALSE)$gg + ggtitle("Sequencing depth normalized") +
    ylab("standard deviation")

# log2-transformed read counts
p2 <- meanSdPlot(counts_log_normalized, ranks = FALSE, plot = FALSE)$gg + ggtitle("Seq depth norm log2(read counts)") +
    ylab("standard deviation")

# Variance Stabilizing Transformation (VST)
p3 <- meanSdPlot(assay(counts_vst), ranks = FALSE, plot = FALSE)$gg + ggtitle("VST(read counts)") +
    ylab("standard deviation")

# Regularized-logarithm Transformation (rlog)
p4 <- meanSdPlot(assay(counts_rlog), ranks = FALSE, plot = FALSE)$gg + ggtitle("rlog(read counts)") +
    ylab("standard deviation")

# Arrange subplots
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

2.4 PCA & Hierarchical Clustering

We’ll continue an Exploratory Data Analysis (EDA) considering variance-stabilized-transformed counts.

# If needed, install the pheatmap and RcolorBrewer packages
install.packages("pheatmap")

The downloaded binary packages are in
    /var/folders/9f/1hd8_p6j2zqf22s70l1y0tbh0000gn/T//RtmpGg3gIr/downloaded_packages
install.packages("RColorBrewer")

The downloaded binary packages are in
    /var/folders/9f/1hd8_p6j2zqf22s70l1y0tbh0000gn/T//RtmpGg3gIr/downloaded_packages
# Load libraries
library(pheatmap)
library(RColorBrewer)

# Calculate samples' distances using VST counts
vst_dist <- dist(t(assay(counts_vst)))

# Transform samples' distances to matrix
vst_dist_matrix <- as.matrix(vst_dist)

# Modify vst_dist_matrix row names by joining 'condition' and 'tissue'
rownames(vst_dist_matrix) <- paste(counts_vst$condition, counts_vst$tissue, sep = " - ")

# Remove vst_dist_matrix col names
colnames(vst_dist_matrix) <- NULL

# Create a smooth blue color gradient with 255 shades
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)

# Draw heatmap
heatmap <- pheatmap(vst_dist_matrix, clustering_distance_rows = vst_dist, clustering_distance_cols = vst_dist,
    col = colors)

They often reflect: biological heterogeneity, batch effects, cell-type composition differences, high expression strength, and technical noise.

# Select the Top 20 variable genes Computes variance per gene across all
# samples
top20_var_genes <- head(order(rowVars(assay(counts_vst)), decreasing = TRUE), 20)

# Transform into a matrix and use vsd transformation
mat <- assay(counts_vst)[top20_var_genes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(counts_vst)[, c("tissue", "condition")])
pheatmap(mat, annotation_col = anno)

# Plot PCA on condition type plotPCA(counts_vst) plotPCA(counts_vst,
# intgroup=c('tissue','condition'))

# Plot PCA on tissue type plotPCA(counts_vst, intgroup='tissue')

# Plot PCA on both condition and tissue type plotPCA(counts_vst,
# intgroup=c('condition', 'tissue'))
install.packages("ggfortify")

The downloaded binary packages are in
    /var/folders/9f/1hd8_p6j2zqf22s70l1y0tbh0000gn/T//RtmpGg3gIr/downloaded_packages
library(ggfortify)

# Calculate prcomp()
counts_vst_mat <- assay(counts_vst)
pca <- prcomp(t(counts_vst_mat), scale. = TRUE)
summary(pca)
Importance of components:
                           PC1      PC2      PC3      PC4      PC5      PC6
Standard deviation     84.2549 55.67822 49.82212 41.97839 36.31965 34.34192
                            PC7      PC8      PC9     PC10     PC11     PC12
Standard deviation     32.61469 30.56473 30.28194 30.06764 27.86170 27.13550
                           PC13     PC14    PC15     PC16     PC17     PC18
Standard deviation     26.33750 26.23412 25.6731 25.49211 24.39336 24.26102
                           PC19     PC20     PC21    PC22     PC23    PC24
Standard deviation     24.01320 23.83034 23.40249 23.3180 22.82627 22.6711
                           PC25     PC26     PC27    PC28     PC29     PC30
Standard deviation     22.51366 22.17801 21.91305 21.7511 21.54273 21.10420
                           PC31     PC32     PC33     PC34     PC35     PC36
Standard deviation     20.82878 20.72855 20.20168 20.07219 19.90433 19.76143
                          PC37     PC38     PC39     PC40     PC41     PC42
Standard deviation     19.5871 19.25004 19.04538 18.86137 18.76491 18.59966
                           PC43      PC44
Standard deviation     18.07365 2.115e-13
 [ reached 'max' / getOption("max.print") -- omitted 2 rows ]
# Combine colData content with PCA scores This idea comes from the following
# post:
# https://hds-sandbox.github.io/bulk_RNAseq_course/develop/06_exploratory_analysis.html
meta <- as.data.frame(colData(counts_vst))
df <- cbind(meta, pca$x)

# Plot PCA score results
p1 <- ggplot(df) + geom_point(aes(x = PC1, y = PC2, color = condition)) + theme_bw() +
    ggtitle("PCA per sample")

# Use of ggfortify package to plot PCA results
p2 <- autoplot(pca, data = t(counts_vst_mat), label = TRUE, label.size = 2) + ggtitle("PCA per sample") +
    theme(plot.title = element_text(hjust = 0.5))

# Arrange subplots
grid.arrange(p1, p2, ncol = 2)

Las dos componentes (PC1 y PC2) explican aproximadamente el 27% de la variabilidad total.

# Plot PCA score results
ggplot(df) + geom_point(aes(x = PC1, y = PC2, color = tissue)) + theme_bw() + ggtitle("PCA per sample")

3 Differential gene expression with DESeq2

Now after the data exploration we can run the differential expression pipeline on the raw counts with a single call to the function DESeq. The null hypothesis is that there is no systematic difference between the average read count values of the different conditions for a given gene. We will calculate the fold change of read counts, assuming de differences in sequencing depth and variability. We will test normal tissue versus cancerous tissue (normal used as denominator for the fold change calculation)

# Run DGE DESeq2 pipeline
dds_DGE <- DESeq(deseq2_raw)
# Mean-dispersion relationship
par(mfrow = c(1, 1))
plotDispEsts(dds_DGE)

We proceed to extract results obtained from the DESeq2 pipeline to obtain the base means across samples, the log2 fold variation, standard errors, p-value and p-adjusted.

# Get results from the DESeq2 pipeline
dds_DGE_results <- results(dds_DGE)
head(dds_DGE_results, n = 5)
log2 fold change (MLE): condition tumor vs normal 
Wald test p-value: condition tumor vs normal 
DataFrame with 5 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat      pvalue
                <numeric>      <numeric> <numeric> <numeric>   <numeric>
ENSG00000000003 3583.2431       0.535005 0.1701870   3.14363 1.66865e-03
ENSG00000000005   36.6453       0.444801 0.4429658   1.00414 3.15309e-01
ENSG00000000419 1944.2916       1.410763 0.2075019   6.79879 1.05499e-11
ENSG00000000457  564.2582      -0.342305 0.0938953  -3.64560 2.66763e-04
ENSG00000000460  259.4360       1.826446 0.1806773  10.10888 5.04556e-24
                       padj
                  <numeric>
ENSG00000000003 4.82369e-03
ENSG00000000005 4.31869e-01
ENSG00000000419 1.18971e-10
ENSG00000000457 9.23596e-04
ENSG00000000460 3.54200e-22
# Inspect the meaning of the columns from the results object
mcols(dds_DGE_results, use.names = TRUE)
DataFrame with 6 rows and 2 columns
                       type            description
                <character>            <character>
baseMean       intermediate mean of normalized c..
log2FoldChange      results log2 fold change (ML..
lfcSE               results standard error: cond..
stat                results Wald statistic: cond..
pvalue              results Wald test p-value: c..
padj                results   BH adjusted p-values

The results object contains the following features: baseMean, log2FoldChange, lfcSE, stat, pvalue, and padj. Let’s get a summary of the results object. Now, let’s obtain a summary of the results object.

# Summary of DGE
summary(dds_DGE_results)

out of 37214 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 9181, 25%
LFC < 0 (down)     : 8838, 24%
outliers [1]       : 0, 0%
low counts [2]     : 2911, 7.8%
(mean count < 0)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

After filtering those genes with p-adjusted < 0.05, we obtained 16.013 significant genes.

# Filter genes with p-adjusted value < 0.05 The results object behaves like a
# dataframe
table(dds_DGE_results$padj < 0.05)

FALSE  TRUE 
18315 16013 

and when we order those significant genes by p-adjusted value:

# Select significant genes with a p-adjusted value lower than 0.05
resSig <- subset(dds_DGE_results, padj < 0.05)

# Order significant genes by p-adjusted value
resSig <- resSig[order(resSig$padj), ]

head(resSig, n = 5)
log2 fold change (MLE): condition tumor vs normal 
Wald test p-value: condition tumor vs normal 
DataFrame with 5 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat       pvalue
                <numeric>      <numeric> <numeric> <numeric>    <numeric>
ENSG00000175832   777.647        5.90807  0.272206   21.7044 1.86433e-104
ENSG00000164379   554.655        6.81304  0.378534   17.9985  2.00230e-72
ENSG00000173898   335.153        3.84377  0.224649   17.1101  1.24719e-65
ENSG00000287828   117.881        3.74330  0.230518   16.2387  2.68768e-59
ENSG00000111110   475.615        2.84099  0.179185   15.8550  1.29783e-56
                        padj
                   <numeric>
ENSG00000175832 6.39988e-100
ENSG00000164379  3.43674e-68
ENSG00000173898  1.42712e-61
ENSG00000287828  2.30657e-55
ENSG00000111110  8.91036e-53

Let’s check those genes with the strongest down-regulation and up-regulation, respectively.

# Order significant genes with the strongest down-regulation
head(resSig[order(resSig$log2FoldChange), ])
log2 fold change (MLE): condition tumor vs normal 
Wald test p-value: condition tumor vs normal 
DataFrame with 6 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat      pvalue
                <numeric>      <numeric> <numeric> <numeric>   <numeric>
ENSG00000181092  37.66103       -8.31119  0.921877  -9.01551 1.95960e-19
ENSG00000034971  41.97610       -8.07256  0.646214 -12.49208 8.24701e-36
ENSG00000167916  26.97151       -6.97326  0.764266  -9.12413 7.23165e-20
ENSG00000286765  17.45913       -6.93212  1.268944  -5.46290 4.68415e-08
ENSG00000241158  43.04560       -6.81121  0.723510  -9.41412 4.77056e-21
ENSG00000298053   7.38357       -6.48845  0.732942  -8.85260 8.55032e-19
                       padj
                  <numeric>
ENSG00000181092 6.81550e-18
ENSG00000034971 3.87813e-33
ENSG00000167916 2.67797e-18
ENSG00000286765 3.09432e-07
ENSG00000241158 2.12680e-19
ENSG00000298053 2.67806e-17
# Order significant genes with the strongest up-regulation
head(resSig[order(resSig$log2FoldChange, decreasing = TRUE), ])
log2 fold change (MLE): condition tumor vs normal 
Wald test p-value: condition tumor vs normal 
DataFrame with 6 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat      pvalue
                <numeric>      <numeric> <numeric> <numeric>   <numeric>
ENSG00000298768   65.6382        8.92977  0.826433  10.80520 3.25260e-27
ENSG00000185269  222.6795        8.72954  0.647884  13.47391 2.22742e-41
ENSG00000260573   48.3035        8.62083  0.801538  10.75536 5.59134e-27
ENSG00000230316   38.2770        8.41512  0.973220   8.64667 5.30221e-18
ENSG00000214039  301.1986        8.23946  0.561699  14.66882 1.02098e-48
ENSG00000167755  159.7233        8.20514  0.846924   9.68817 3.38539e-22
                       padj
                  <numeric>
ENSG00000298768 3.75943e-25
ENSG00000185269 2.83196e-38
ENSG00000260573 6.13226e-25
ENSG00000230316 1.47499e-16
ENSG00000214039 3.18620e-45
ENSG00000167755 1.80456e-20
# Save significant DGE results to file
write.table(resSig, file = "DESeq2_sig_genes_normal_vs_cancer_44_locrc.tab", sep = "\t",
    quote = FALSE, row.names = FALSE)

3.0.1 Visualization of results

3.0.1.1 Individual gene count plot

To visualize the counts for ENSG00000175832 gene, which showed the lowest p-adjusted value of 6.39988e-100, we’ll use the plotCounts function over the feature condition to get the normalized counts for this gene.

BiocManager::install("ggbeeswarm")
library("ggbeeswarm")

# Set a 1 by 2 plot frame par(mfrow=c(1,2))

# Individual gene raw normalized counts
plotCounts(dds_DGE, gene = "ENSG00000175832", intgroup = c("condition"))

# Individual gene raw counts using ggplot2
geneCounts <- plotCounts(dds_DGE, gene = "ENSG00000175832", intgroup = c("condition",
    "tissue"), returnData = TRUE)

ggplot(geneCounts, aes(x = condition, y = count, color = tissue)) + geom_beeswarm(cex = 1)

3.0.1.2 Histogram of p-value frecuencies

# Histogram of p-value frequencies
par(mfrow = c(1, 1))
hist(dds_DGE_results$pvalue, col = "blue", xlab = "", border = "white", ylab = "Frequency",
    breaks = 0:40/40, main = "Frequencies of p-values")

3.0.1.3 MA-plot

*An MA-plot (Dudoit et al. 2002) provides a useful overview for the distribution of the estimated coefficients in the model, e.g. the comparisons of interest, across all genes. On the y-axis, the “M” stands for “minus” – subtraction of log values is equivalent to the log of the ratio – and on the x-axis, the “A” stands for “average”. This plot allows us to evaluate fold changes and the distribution around the mean expression.

Before making the MA-plot, we use the lfcShrink function to shrink the log2 fold changes for the comparison of dex treated vs untreated samples. There are three types of shrinkage estimators in DESeq2, which are covered in the DESeq2 vignette. log2 fold shrinkage remove the noise preserving the large differences.*

# Install modules and libraries Log^2^ fold shrinkage
BiocManager::install("apeglm")
library("apeglm")

par(mfrow = c(1, 2))

# MA-plot without shrinkage
plotMA(dds_DGE_results, alpha = 0.05, main = "Normal vs. Tumor", ylim = c(-5, 5))

# MA-plot with shrinkage Check possible contrast to create the MA-plot
resultsNames(dds_DGE)
[1] "Intercept"                 "condition_tumor_vs_normal"
res <- lfcShrink(dds_DGE, coef = "condition_tumor_vs_normal", type = "apeglm")
plotMA(res, alpha = 0.05, main = "Normal vs. Tumor log2 shrink", ylim = c(-5, 5))

3.0.1.4 Volcano plot

Volcano plot helps to visualize significant differential expressed genes showing the log2 fold change and the significance (p-adjusted).

# Volcano plot of significant genes Normal vs Tumor Good resources to
# understand volcano plots: https://biostatsquid.com/volcano-plot/

# Order by adjusted p-value
dds_DGE_volcano <- dds_DGE_results[order(dds_DGE_results$padj), ]

##############################################################################

# Load modules and libraries
BiocManager::install("AnnotationDbi")
BiocManager::install("org.Hs.eg.db")
library("AnnotationDbi")
library("org.Hs.eg.db")

# Add gene symbol column
dds_DGE_volcano$SYMBOL = mapIds(org.Hs.eg.db, keys = rownames(dds_DGE_volcano), column = "SYMBOL",
    keytype = "ENSEMBL", multiVals = "first")

# Add gene name column
dds_DGE_volcano$GENENAME = mapIds(org.Hs.eg.db, keys = rownames(dds_DGE_volcano),
    column = "GENENAME", keytype = "ENSEMBL", multiVals = "first")

# Add gene Entrez ID column
dds_DGE_volcano$ENTREZID = mapIds(org.Hs.eg.db, keys = rownames(dds_DGE_volcano),
    column = "ENTREZID", keytype = "ENSEMBL", multiVals = "first")

# Add gene EGG functional pathway column
dds_DGE_volcano$PATH = mapIds(org.Hs.eg.db, keys = rownames(dds_DGE_volcano), column = "PATH",
    keytype = "ENSEMBL", multiVals = "first")


##############################################################################

# Categorize genes: Up, Down, Not Sig
results_order <- as.data.frame(dds_DGE_volcano)
results_order$category <- dplyr::case_when(results_order$padj < 0.05 & results_order$log2FoldChange >
    0 ~ "Upregulated", results_order$padj < 0.05 & results_order$log2FoldChange <
    0 ~ "Downregulated", TRUE ~ "Not significant")

# Volcano plot
p <- ggplot2::ggplot(results_order, ggplot2::aes(x = log2FoldChange, y = -log10(padj))) +
    ggplot2::geom_point(ggplot2::aes(color = category)) + ggplot2::scale_color_manual(values = c(Upregulated = "red",
    Downregulated = "blue", `Not significant` = "grey")) + ggplot2::ggtitle("Significant genes Normal vs. Tumor")
# Install modules and libraries Get ENSEMBL gene names Grepel library avoids
# gene names overlaping
BiocManager::install("ggrepel")
library(ggrepel)

# Select genes with |log2FC| > 1 and padj < 0.05
DEgenes_DESeq <- results_order[abs(results_order$log2FoldChange) > log2(2) & results_order$padj <
    0.05, ]

# Sort by adj p-value
DEgenes_DESeq <- DEgenes_DESeq[order(DEgenes_DESeq$padj), ]

# Take up to 10 genes
top30 <- head(DEgenes_DESeq, 30)

# Add labels using rownames of the subset aes(label = rownames(top10)),
p + geom_text_repel(data = top30, aes(label = top30$SYMBOL), size = 2, max.overlaps = 100)

4 Annotation of results

In this section, we will add the gene symbol, gene name, KEGG pathway, and Entrez ID to our dataset to provide more informative annotations for the DESeq analysis, which currently uses ENSEMBL gene ID nomenclature (e.g., ENSG00000175832).

# Load modules and libraries
BiocManager::install("AnnotationDbi")
BiocManager::install("org.Hs.eg.db")
library("AnnotationDbi")
library("org.Hs.eg.db")
# Check all potential terms that could be retrieved with AnnotationDbi using
# mapsIds()
columns(org.Hs.eg.db)
 [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
 [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
[11] "GENETYPE"     "GO"           "GOALL"        "IPI"          "MAP"         
[16] "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"        
[21] "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
[26] "UNIPROT"     
# Review the most significant genes from our study Previously, we've saved a in
# a .tab file the significant DGE results file =
# 'DESeq2_sig_genes_normal_vs_cancer_44_locrc.tab'
head(resSig)
log2 fold change (MLE): condition tumor vs normal 
Wald test p-value: condition tumor vs normal 
DataFrame with 6 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat       pvalue
                <numeric>      <numeric> <numeric> <numeric>    <numeric>
ENSG00000175832   777.647        5.90807  0.272206   21.7044 1.86433e-104
ENSG00000164379   554.655        6.81304  0.378534   17.9985  2.00230e-72
ENSG00000173898   335.153        3.84377  0.224649   17.1101  1.24719e-65
ENSG00000287828   117.881        3.74330  0.230518   16.2387  2.68768e-59
ENSG00000111110   475.615        2.84099  0.179185   15.8550  1.29783e-56
ENSG00000165816   447.515        4.61192  0.301498   15.2967  8.04376e-53
                        padj
                   <numeric>
ENSG00000175832 6.39988e-100
ENSG00000164379  3.43674e-68
ENSG00000173898  1.42712e-61
ENSG00000287828  2.30657e-55
ENSG00000111110  8.91036e-53
ENSG00000165816  4.60210e-49

We proceed by adding annotations starting from ENSEMBL to Entrez and from there to KEGG pathways, gene symbols and gene names. This is because, all major annotation databases (KEGG, org.Hs.eg.db, clusterProfiler) are indexed by Entrez, so doing the conversion first dramatically reduces NA values and gives more complete annotations.

# Add gene symbol column
resSig$SYMBOL = mapIds(org.Hs.eg.db, keys = rownames(resSig), column = "SYMBOL",
    keytype = "ENSEMBL", multiVals = "first")

# Add gene name column
resSig$GENENAME = mapIds(org.Hs.eg.db, keys = rownames(resSig), column = "GENENAME",
    keytype = "ENSEMBL", multiVals = "first")

# Add gene Entrez ID column
resSig$ENTREZID = mapIds(org.Hs.eg.db, keys = rownames(resSig), column = "ENTREZID",
    keytype = "ENSEMBL", multiVals = "first")

# Add gene EGG functional pathway column
resSig$PATH = mapIds(org.Hs.eg.db, keys = rownames(resSig), column = "PATH", keytype = "ENSEMBL",
    multiVals = "first")

# Add gene ENSEMBL ID column
resSig$ENSEMBL <- rownames(resSig)

# Convert to dataframe
resSig_df <- as.data.frame(resSig)
print(head(resSig_df), 5)
                baseMean log2FoldChange     lfcSE     stat        pvalue
ENSG00000175832 777.6471       5.908068 0.2722060 21.70440 1.864333e-104
ENSG00000164379 554.6548       6.813042 0.3785342 17.99848  2.002298e-72
ENSG00000173898 335.1535       3.843773 0.2246489 17.11013  1.247189e-65
ENSG00000287828 117.8814       3.743298 0.2305178 16.23865  2.687679e-59
                         padj SYMBOL                           GENENAME
ENSG00000175832 6.399884e-100   ETV4 ETS variant transcription factor 4
ENSG00000164379  3.436743e-68  FOXQ1                    forkhead box Q1
ENSG00000173898  1.427116e-61 SPTBN2  spectrin beta, non-erythrocytic 2
ENSG00000287828  2.306566e-55   <NA>                               <NA>
                ENTREZID PATH         ENSEMBL
ENSG00000175832     2118 <NA> ENSG00000175832
ENSG00000164379    94234 <NA> ENSG00000164379
ENSG00000173898     6712 <NA> ENSG00000173898
ENSG00000287828     <NA> <NA> ENSG00000287828
 [ reached 'max' / getOption("max.print") -- omitted 2 rows ]
# Save data frame including annotations
write.table(resSig_df, file = ("DESeq2_sig_genes_normal_vs_cancer_44_locrc_annotated.tab"),
    row.names = F, col.names = TRUE, sep = "\t")

I see many NAs under PATH, is there an alternative as for example KEGGREST.

4.1 Functional Gene Ontology (GO) Analysis

# Load modules and libraries
BiocManager::install("clusterProfiler")
BiocManager::install("ggnewscale")

library("clusterProfiler")
library("enrichplot")
library("ggnewscale")
# Assign the human gene annotation database
org_db <- org.Hs.eg.db

# Create an gene vector containing ENTREZIDs
genes <- as.character(resSig$ENTREZID)

# Run GO classification using groupGO It answers the question: How many DE
# genes belong to each GO category?  It classifies genes into GO categories at
# a particular hierarchical level CC = cel comp, MF = mol func, and BP = bio
# process
ggo <- clusterProfiler::groupGO(gene = genes, OrgDb = org_db, ont = "BP", level = 3,
    readable = TRUE)

# Convert to dataframe
ggo_df <- as.data.frame(ggo)

# Visualize the last columns
head(ggo_df[-5])
                   ID                                 Description Count
GO:0001776 GO:0001776                       leukocyte homeostasis    74
GO:0002200 GO:0002200 somatic diversification of immune receptors    48
GO:0002252 GO:0002252                     immune effector process   374
GO:0002253 GO:0002253               activation of immune response   362
GO:0002262 GO:0002262                    myeloid cell homeostasis   133
GO:0002339 GO:0002339                            B cell selection     4
           GeneRatio
GO:0001776  74/12698
GO:0002200  48/12698
GO:0002252 374/12698
GO:0002253 362/12698
GO:0002262 133/12698
GO:0002339   4/12698
# Horizontal plot of cell component categories
barplot(ggo, drop = TRUE, showCategory = 30, font.size = 5)

# Run GO over-representation test enrichGO
ego <- clusterProfiler::enrichGO(gene = genes, OrgDb = org_db, ont = "BP", pAdjustMethod = "BH",
    pvalueCutoff = 0.05, qvalueCutoff = 0.05, readable = TRUE)

# Horizontal plot of biological process categories Alternative representations
# https://yulab-smu.top/biomedical-knowledge-mining-book/enrichplot.html
barplot(ego, drop = TRUE, showCategory = 30, font.size = 5)

# Convert to dataframe
ego_df <- as.data.frame(ego)

# Visualize the last columns
head(ego_df, 1)
                   ID         Description GeneRatio   BgRatio RichFactor
GO:0042254 GO:0042254 ribosome biogenesis  262/9826 318/18860  0.8238994
           FoldEnrichment   zScore      pvalue    p.adjust       qvalue
GO:0042254        1.58139 10.90463 4.31979e-30 2.75473e-26 2.114878e-26
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               geneID
GO:0042254 DKC1/EXOSC2/UTP25/HEATR1/NLE1/NOP58/DDX55/RPL7L1/BOP1/DDX31/NAT10/RIOK3/LYAR/NOL8/GTF3A/WDR75/DDX10/WDR43/EXOSC5/WDR12/RPP40/WDR3/UTP14A/UTP20/BRIX1/PRKDC/PA2G4/GAR1/SUV39H1/RRP15/KAT2B/SPOUT1/NOP14/DDX27/URB1/LAS1L/NPM3/DCAF13/TMA16/MTERF3/RRP9/NOB1/NSUN5/PDCD11/DDX18/WDR36/PNO1/RIOK1/GTPBP4/LTV1/RRS1/AIRIM/RNASEL/EXOSC8/UTP18/BYSL/SRFBP1/EXOSC3/EXOSC7/WDR46/NPM1/NOL6/RPF2/DDX56/FBL/ERCC2/RIOX2/UTP4/RPUSD4/EIF1AX/NOLC1/GTPBP10/DHX37/RAN/NOP2/UTP23/PAK1IP1/SDAD1/NHP2/DDX21/MYBBP1A/MPHOSPH10/NOL10/TSR1/XPO1/URB2/BMS1/PES1/MTG2/RRP1B/RCL1/NOP56/UTP15/MRM2/SIRT7/RPUSD1/BUD23/DIS3/GEMIN4/WDR74/EBNA1BP2/CUL4A/PPAN/TRMT61B/CHD7/DDX28/GNL3L/NOM1/TFB1M/MRTO4/ZNHIT6/DDX52/SURF6/DIMT1/MAK16/NOC4L/NOL11/RRP1/NOC2L/AATF/POP7/RPS21/HEATR3/EXOSC9/IMP4/FASTKD2/C1QBP/UTP6/PWP1/NIP7/DDX54/XRN2/EMG1/EIF5B/DHX30/KRI1/NUDT16/NUP88/DDX51/ZCCHC4/REXO4/TRMT2B/METTL5/TRMT112/WDR18/KRR1/NAF1/NMD3/NOP16/MDN1/RRP8/RPS19/TFB2M/AFG2A/METTL16/NOL7/MRPS2/MALSU1/LSM6/XRCC5/ISG20/REXO5/RPS4X/RPL10L/RPS7/GNL2/ZNF593/ISG20L2/DDX47/EIF2A/GRWD1/EIF5/RPL7A/RPL35/RBIS/ESF1/PELP1/CUL4B/TBL3/NSUN3/UTP3/RPS6/RPL14/UTP11/ABT1/RPS5/ERI1/RPLP0/MYG1/SNU13/RPL26L1/EFL1/RPLP0P6/RPL7/RRP7BP/RBFA/USP36/EXOSC4/PIN4/RPUSD2/ERAL1/RPS15/FTSJ3/FSAF1/RPL38/DDX49/RPS16/MPHOSPH6/EIF6/MTREX/LSG1/MRM1/RPSA/DNTTIP2/CDKN2A/POP5/GLUL/RRP7A/RCC1L/EIF4A3/PIH1D2/RPL27/SBDS/RPS9/RPS11/RPS12/RPL24/RPL35A/RPS17/DROSHA/RPS27A/RRN3/ZNF622/MCAT/EXOSC6/EXOSC1/RPL5/RPS15A/RPS24/TSR2/AFG2B/RPS8/NSA2/RPS23/RPS14/WBP11/NGDN/NOA1/FCF1/RBM34/ZNHIT3/RSL24D1/RIOK2/RPP38/TENT4B/SLX9/RPS25/RRP36/RPS28/RPP30/RPSA2/MRM3
           Count
GO:0042254   262
# Get dataframe dimmensions
dim(ego_df)
[1] 869  12

4.2 Gene-Concept Network and Enrichment Map

The Gene-Concept Network displays which genes are associated with which biological processes, making it easy to see gene-function relationships. Meanwhile, the enrichment map groups similar terms together. Terms that share genes are connected, making related functions easy to spot.

# Load modules and libraries
library(DOSE)
# Convert gene ID to Symbol
egox <- setReadable(ego, "org.Hs.eg.db", "ENTREZID")

# Generate geneList with Log2FoldChange data
geneList <- resSig$log2FoldChange
names(geneList) <- as.character(unique(resSig$ENTREZID))
geneList <- sort(geneList, decreasing = TRUE)

head(geneList)
     5333    147111      4857     10004 100190940    401093 
 8.929771  8.729536  8.620832  8.415120  8.239463  8.205140 
# Network plot of enriched terms CategorySize can be scaled by 'pvalue' or
# 'geneNum'
p1 <- cnetplot(egox, categorySize = "pvalue", foldChange = geneList)

# Display the plot
print(p1)

# Calculate pairwise similarities of enriched terms
egox_pairwise <- pairwise_termsim(egox)

# Creates an enrichment map visualization from pairwise enrichment analysis
# results
p2 <- emapplot(egox_pairwise, layout = "kk")

# Display the plot
print(p2)

# Plot Gene-Concept Network and Enrichment Map in a PDF file Start PDF device
pdf(file = paste("Merge_GO_Gene_Set_Enrichment_Analysis", "CC", ".pdf", sep = "_"),
    width = 25, height = 10)

# Create the plot
cowplot::plot_grid(p1, p2, ncol = 2, labels = LETTERS[1:3], rel_widths = c(0.8, 0.8,
    1.2))

# Close PDF device
dev.off()
quartz_off_screen 
                2